library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(grid)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
### TRAIN DATA
# Read the training regression table; readr infers the column types.
train <- readr::read_csv(file = "train_reg.csv")
## Rows: 2942 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): q_demos_state
## dbl (34): year, month, order_totals, log_total, count, count_female, count_m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the order-level detail file for the training period.
amazon_order_train <- readr::read_csv(file = "amazon_order_details_train.csv")
## Rows: 913512 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): shipping_address_state, title, asin_isbn_product_code, category, s...
## dbl (3): purchase_price_per_unit, quantity, item_cost
## date (1): order_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the customer survey responses that accompany the training orders.
customer_info_train <- readr::read_csv(file = "customer_info_train.csv")
## Rows: 2512 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (23): survey_response_id, q_demos_age, q_demos_hispanic, q_demos_race, q...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### TEST DATA
# Read the test regression table; readr infers the column types.
test <- readr::read_csv(file = "test_reg.csv")
## Rows: 2952 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): q_demos_state
## dbl (33): id, year, month, count, count_female, count_male, count_less5, cou...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the order-level detail file for the test period.
amazon_order_test <- readr::read_csv(file = "amazon_order_details_test.csv")
## Rows: 896514 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): shipping_address_state, category, survey_response_id
## dbl (1): quantity
## date (1): order_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the customer survey responses that accompany the test orders.
customer_info_test <- readr::read_csv(file = "customer_info_test.csv")
## Rows: 2513 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (23): survey_response_id, q_demos_age, q_demos_hispanic, q_demos_race, q...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the training table.
head(train, n = 6)
## # A tibble: 6 × 35
## q_demos_state year month order_totals log_total count count_female count_male
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama 2018 1 1774. 3.25 53 49 4
## 2 Alabama 2018 2 2015. 3.30 49 47 2
## 3 Alabama 2018 3 1689. 3.23 51 48 3
## 4 Alabama 2018 4 3304. 3.52 47 42 5
## 5 Alabama 2018 5 1923. 3.28 43 41 2
## 6 Alabama 2018 6 2497. 3.40 62 57 5
## # ℹ 27 more variables: count_less5 <dbl>, count_5to10 <dbl>,
## # count_over10 <dbl>, count_hh1 <dbl>, count_hh2 <dbl>, count_hh3 <dbl>,
## # count_hh4 <dbl>, count_howmany1 <dbl>, count_howmany2 <dbl>,
## # count_howmany3 <dbl>, count_howmany4 <dbl>, count_1824 <dbl>,
## # count_2534 <dbl>, count_3544 <dbl>, count_4554 <dbl>, count_5564 <dbl>,
## # count_65up <dbl>, count_und25k <dbl>, count_2549k <dbl>, count_5074k <dbl>,
## # count_7599k <dbl>, count_100149k <dbl>, count_150kup <dbl>, …
# Number of rows and columns in the training table.
c(nrow(train), ncol(train))
## [1] 2942 35
# List every column name in the training table.
names(train)
## [1] "q_demos_state" "year" "month" "order_totals"
## [5] "log_total" "count" "count_female" "count_male"
## [9] "count_less5" "count_5to10" "count_over10" "count_hh1"
## [13] "count_hh2" "count_hh3" "count_hh4" "count_howmany1"
## [17] "count_howmany2" "count_howmany3" "count_howmany4" "count_1824"
## [21] "count_2534" "count_3544" "count_4554" "count_5564"
## [25] "count_65up" "count_und25k" "count_2549k" "count_5074k"
## [29] "count_7599k" "count_100149k" "count_150kup" "count_lessHS"
## [33] "count_HS" "count_B" "count_G"
# Drop order_totals from the training table. dplyr:: is spelled out because
# MASS::select masks dplyr::select in this session.
train <- dplyr::select(train, -order_totals)
# Estimate multivariate Box-Cox powers for the 30 count predictors.
# The predictors are the contiguous columns `count` through `count_G`, taken in
# table order. Each value is shifted by 0.001 so that zero counts are strictly
# positive before the power family is fitted. Dimnames are dropped so the
# summary rows are labelled Y1..Y30 in column order.
summary(car::powerTransform(
  unname(as.matrix(dplyr::select(train, count:count_G)) + 0.001) ~ 1
))
## bcPower Transformations to Multinormality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1 0.5055 0.51 0.5010 0.5099
## Y2 0.5008 0.50 0.4951 0.5065
## Y3 0.5318 0.53 0.5256 0.5380
## Y4 0.5007 0.50 0.4939 0.5074
## Y5 0.5400 0.54 0.5341 0.5460
## Y6 0.4925 0.50 0.4828 0.5022
## Y7 0.5074 0.50 0.4992 0.5156
## Y8 0.5448 0.54 0.5380 0.5516
## Y9 0.4955 0.50 0.4868 0.5042
## Y10 0.5314 0.53 0.5243 0.5386
## Y11 0.5228 0.52 0.5151 0.5306
## Y12 0.5343 0.53 0.5250 0.5437
## Y13 0.2078 0.21 0.1942 0.2213
## Y14 0.0890 0.09 0.0755 0.1026
## Y15 0.3343 0.33 0.3235 0.3451
## Y16 0.5339 0.53 0.5247 0.5431
## Y17 0.5610 0.56 0.5508 0.5711
## Y18 0.5048 0.50 0.4940 0.5156
## Y19 0.3666 0.37 0.3546 0.3787
## Y20 -0.0023 0.00 -0.0163 0.0117
## Y21 0.3380 0.33 0.3257 0.3504
## Y22 0.4845 0.48 0.4727 0.4963
## Y23 0.5183 0.52 0.5066 0.5300
## Y24 0.5045 0.50 0.4923 0.5166
## Y25 0.5136 0.51 0.5030 0.5242
## Y26 0.3540 0.35 0.3417 0.3662
## Y27 -0.5366 -0.54 -0.5595 -0.5138
## Y28 0.5254 0.53 0.5189 0.5319
## Y29 0.5068 0.50 0.5000 0.5136
## Y30 0.5217 0.52 0.5133 0.5302
##
## Likelihood ratio test that transformation parameters are equal to 0
## (all log transformations)
## LRT
## LR test, lambda = (0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) 145960.4
## df
## LR test, lambda = (0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) 30
## pval
## LR test, lambda = (0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) < 2.22e-16
##
## Likelihood ratio test that no transformations are needed
## LRT
## LR test, lambda = (1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1) -144891.6
## df
## LR test, lambda = (1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1) 30
## pval
## LR test, lambda = (1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1) 1
# Box-Cox check for the time variables.
# `train` has no `factor_state` column (the state is stored in the character
# column `q_demos_state`), so the former `train$factor_state` argument evaluated
# to NULL, was silently dropped by cbind(), and only produced an
# "Unknown or uninitialised column" warning. It has been removed; the fitted
# matrix is unchanged: Y1 is month and Y2 is year (shifted by 0.001).
summary(car::powerTransform(cbind(train$month, train$year + 0.001) ~ 1))
## bcPower Transformations to Multinormality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1 0.726 0.73 0.6683 0.7836
## Y2 3.000 1.00 -54.3107 60.3106
##
## Likelihood ratio test that transformation parameters are equal to 0
## (all log transformations)
## LRT df pval
## LR test, lambda = (0 0) 694.8659 2 < 2.22e-16
##
## Likelihood ratio test that no transformations are needed
## LRT df pval
## LR test, lambda = (1 1) 82.14562 2 < 2.22e-16
# Add a `*_tf` column for every count predictor, using powers rounded from the
# Box-Cox estimates printed above (mostly square roots). The original columns
# are kept untouched.
train_transformed <- dplyr::mutate(
  train,
  # Overall and gender counts
  count_tf          = sqrt(count),
  count_female_tf   = sqrt(count_female),
  count_male_tf     = sqrt(count_male),
  # Orders-per-month buckets
  count_less5_tf    = sqrt(count_less5),
  count_5to10_tf    = sqrt(count_5to10),
  count_over10_tf   = sqrt(count_over10),
  # Household size
  count_hh1_tf      = sqrt(count_hh1),
  count_hh2_tf      = sqrt(count_hh2),
  count_hh3_tf      = sqrt(count_hh3),
  count_hh4_tf      = sqrt(count_hh4),
  # Number of people sharing the account
  count_howmany1_tf = sqrt(count_howmany1),
  count_howmany2_tf = sqrt(count_howmany2),
  count_howmany3_tf = count_howmany3^0.21,
  # Recommended power was 0.09, which is about 0, so a log is used; the 0.001
  # offset keeps zero counts finite.
  count_howmany4_tf = log(count_howmany4 + 0.001),
  # Age brackets
  count_1824_tf     = count_1824^0.33,
  count_2534_tf     = sqrt(count_2534),
  count_3544_tf     = sqrt(count_3544),
  count_4554_tf     = sqrt(count_4554),
  count_5564_tf     = count_5564^0.37,
  # Be careful with this one: it is a base-10 log (the log above is natural).
  count_65up_tf     = log10(count_65up + 0.001),
  # Income brackets
  count_und25k_tf   = count_und25k^0.33,
  count_2549k_tf    = sqrt(count_2549k),
  count_5074k_tf    = sqrt(count_5074k),
  count_7599k_tf    = sqrt(count_7599k),
  # NOTE(review): the name says 101149k but the source column is count_100149k.
  # The spelling is kept because later plots reference this exact name.
  count_101149k_tf  = sqrt(count_100149k),
  count_150kup_tf   = count_150kup^0.35,
  # Education; the 0.001 offset is added to the square root, not to the count.
  count_lessHS_tf   = 1 / (sqrt(count_lessHS) + 0.001),
  count_HS_tf       = sqrt(count_HS),
  count_B_tf        = sqrt(count_B),
  count_G_tf        = sqrt(count_G)
)
## Any NA values?
## Result: No
# Print where the missing values of `data` are and how many there are.
#
# data: any object accepted by is.na() (e.g. a data frame or vector).
# Prints the positions of NA entries as given by which(), then the NA count.
# The count is also the (invisible) return value, since print() returns its
# argument invisibly.
check_na <- function(data) {
  na_mask <- is.na(data)
  print(which(na_mask))
  print(sum(na_mask))
}
# Print the NA positions and the NA count for the training table.
check_na(train)
## integer(0)
## [1] 0
All of the count variables below are right-skewed.
# Histograms (30 bins) of every raw count predictor, arranged one grid per
# variable family.

# Gender and overall count
count_female_hist <- ggplot(train, aes(x = count_female)) + geom_histogram(bins = 30)
count_male_hist <- ggplot(train, aes(x = count_male)) + geom_histogram(bins = 30)
count_hist <- ggplot(train, aes(x = count)) + geom_histogram(bins = 30)
grid.arrange(count_female_hist, count_male_hist, count_hist)

# Orders-per-month buckets
count_less5_hist <- ggplot(train, aes(x = count_less5)) + geom_histogram(bins = 30)
count_5to10_hist <- ggplot(train, aes(x = count_5to10)) + geom_histogram(bins = 30)
count_over10_hist <- ggplot(train, aes(x = count_over10)) + geom_histogram(bins = 30)
grid.arrange(count_less5_hist, count_5to10_hist, count_over10_hist)

# Household size
count_hh1_hist <- ggplot(train, aes(x = count_hh1)) + geom_histogram(bins = 30)
count_hh2_hist <- ggplot(train, aes(x = count_hh2)) + geom_histogram(bins = 30)
count_hh3_hist <- ggplot(train, aes(x = count_hh3)) + geom_histogram(bins = 30)
count_hh4_hist <- ggplot(train, aes(x = count_hh4)) + geom_histogram(bins = 30)
grid.arrange(count_hh1_hist, count_hh2_hist, count_hh3_hist, count_hh4_hist)

# Number of people sharing the account
count_how_many1_hist <- ggplot(train, aes(x = count_howmany1)) + geom_histogram(bins = 30)
count_how_many2_hist <- ggplot(train, aes(x = count_howmany2)) + geom_histogram(bins = 30)
count_how_many3_hist <- ggplot(train, aes(x = count_howmany3)) + geom_histogram(bins = 30)
count_how_many4_hist <- ggplot(train, aes(x = count_howmany4)) + geom_histogram(bins = 30)
grid.arrange(count_how_many1_hist, count_how_many2_hist, count_how_many3_hist, count_how_many4_hist)

# Age brackets
count_1824_hist <- ggplot(train, aes(x = count_1824)) + geom_histogram(bins = 30)
count_2534_hist <- ggplot(train, aes(x = count_2534)) + geom_histogram(bins = 30)
count_3544_hist <- ggplot(train, aes(x = count_3544)) + geom_histogram(bins = 30)
count_4554_hist <- ggplot(train, aes(x = count_4554)) + geom_histogram(bins = 30)
count_5564_hist <- ggplot(train, aes(x = count_5564)) + geom_histogram(bins = 30)
count_65up_hist <- ggplot(train, aes(x = count_65up)) + geom_histogram(bins = 30)
grid.arrange(count_1824_hist, count_2534_hist, count_3544_hist, count_4554_hist, count_5564_hist, count_65up_hist)

# Income brackets
count_und25k_hist <- ggplot(train, aes(x = count_und25k)) + geom_histogram(bins = 30)
count_2549k_hist <- ggplot(train, aes(x = count_2549k)) + geom_histogram(bins = 30)
count_5074k_hist <- ggplot(train, aes(x = count_5074k)) + geom_histogram(bins = 30)
count_7599k_hist <- ggplot(train, aes(x = count_7599k)) + geom_histogram(bins = 30)
count_100149k_hist <- ggplot(train, aes(x = count_100149k)) + geom_histogram(bins = 30)
count_150kup_hist <- ggplot(train, aes(x = count_150kup)) + geom_histogram(bins = 30)
# Fixed: the second panel used to be count_2534_hist (the 25-34 age bracket),
# so the 25-49k income histogram was never shown.
grid.arrange(count_und25k_hist, count_2549k_hist, count_5074k_hist, count_7599k_hist, count_100149k_hist, count_150kup_hist)

# Education
count_lessHS_hist <- ggplot(train, aes(x = count_lessHS)) + geom_histogram(bins = 30)
count_HS_hist <- ggplot(train, aes(x = count_HS)) + geom_histogram(bins = 30)
count_B_hist <- ggplot(train, aes(x = count_B)) + geom_histogram(bins = 30)
count_G_hist <- ggplot(train, aes(x = count_G)) + geom_histogram(bins = 30)
grid.arrange(count_lessHS_hist, count_HS_hist, count_B_hist, count_G_hist)
## Outliers
# Horizontal boxplots of every raw count predictor (to eyeball outliers),
# arranged one grid per variable family.

# Gender and overall count
count_female_box <- ggplot(train, aes(x = count_female)) + geom_boxplot()
count_male_box <- ggplot(train, aes(x = count_male)) + geom_boxplot()
count_box <- ggplot(train, aes(x = count)) + geom_boxplot()
grid.arrange(count_female_box, count_male_box, count_box)

# Orders-per-month buckets
count_less5_box <- ggplot(train, aes(x = count_less5)) + geom_boxplot()
count_5to10_box <- ggplot(train, aes(x = count_5to10)) + geom_boxplot()
count_over10_box <- ggplot(train, aes(x = count_over10)) + geom_boxplot()
grid.arrange(count_less5_box, count_5to10_box, count_over10_box)

# Household size
count_hh1_box <- ggplot(train, aes(x = count_hh1)) + geom_boxplot()
count_hh2_box <- ggplot(train, aes(x = count_hh2)) + geom_boxplot()
count_hh3_box <- ggplot(train, aes(x = count_hh3)) + geom_boxplot()
count_hh4_box <- ggplot(train, aes(x = count_hh4)) + geom_boxplot()
grid.arrange(count_hh1_box, count_hh2_box, count_hh3_box, count_hh4_box)

# Number of people sharing the account
count_how_many1_box <- ggplot(train, aes(x = count_howmany1)) + geom_boxplot()
count_how_many2_box <- ggplot(train, aes(x = count_howmany2)) + geom_boxplot()
count_how_many3_box <- ggplot(train, aes(x = count_howmany3)) + geom_boxplot()
count_how_many4_box <- ggplot(train, aes(x = count_howmany4)) + geom_boxplot()
grid.arrange(count_how_many1_box, count_how_many2_box, count_how_many3_box, count_how_many4_box)

# Age brackets
count_1824_box <- ggplot(train, aes(x = count_1824)) + geom_boxplot()
count_2534_box <- ggplot(train, aes(x = count_2534)) + geom_boxplot()
count_3544_box <- ggplot(train, aes(x = count_3544)) + geom_boxplot()
count_4554_box <- ggplot(train, aes(x = count_4554)) + geom_boxplot()
count_5564_box <- ggplot(train, aes(x = count_5564)) + geom_boxplot()
count_65up_box <- ggplot(train, aes(x = count_65up)) + geom_boxplot()
grid.arrange(count_1824_box, count_2534_box, count_3544_box, count_4554_box, count_5564_box, count_65up_box)

# Income brackets
count_und25k_box <- ggplot(train, aes(x = count_und25k)) + geom_boxplot()
count_2549k_box <- ggplot(train, aes(x = count_2549k)) + geom_boxplot()
count_5074k_box <- ggplot(train, aes(x = count_5074k)) + geom_boxplot()
count_7599k_box <- ggplot(train, aes(x = count_7599k)) + geom_boxplot()
count_100149k_box <- ggplot(train, aes(x = count_100149k)) + geom_boxplot()
count_150kup_box <- ggplot(train, aes(x = count_150kup)) + geom_boxplot()
# Fixed: the second panel used to be count_2534_box (the 25-34 age bracket),
# so the 25-49k income boxplot was never shown.
grid.arrange(count_und25k_box, count_2549k_box, count_5074k_box, count_7599k_box, count_100149k_box, count_150kup_box)

# Education
count_lessHS_box <- ggplot(train, aes(x = count_lessHS)) + geom_boxplot()
count_HS_box <- ggplot(train, aes(x = count_HS)) + geom_boxplot()
count_B_box <- ggplot(train, aes(x = count_B)) + geom_boxplot()
count_G_box <- ggplot(train, aes(x = count_G)) + geom_boxplot()
grid.arrange(count_lessHS_box, count_HS_box, count_B_box, count_G_box)
# Histograms (30 bins) of the transformed `*_tf` predictors, to compare against
# the raw-scale histograms. The `*_hist` names from the raw-scale section are
# deliberately reused and therefore overwritten here.

# Gender and overall count
count_female_hist <- ggplot(train_transformed, aes(x = count_female_tf)) + geom_histogram(bins = 30)
count_male_hist <- ggplot(train_transformed, aes(x = count_male_tf)) + geom_histogram(bins = 30)
count_hist <- ggplot(train_transformed, aes(x = count_tf)) + geom_histogram(bins = 30)
grid.arrange(count_female_hist, count_male_hist, count_hist)

# Orders-per-month buckets
count_less5_hist <- ggplot(train_transformed, aes(x = count_less5_tf)) + geom_histogram(bins = 30)
count_5to10_hist <- ggplot(train_transformed, aes(x = count_5to10_tf)) + geom_histogram(bins = 30)
count_over10_hist <- ggplot(train_transformed, aes(x = count_over10_tf)) + geom_histogram(bins = 30)
grid.arrange(count_less5_hist, count_5to10_hist, count_over10_hist)

# Household size
count_hh1_hist <- ggplot(train_transformed, aes(x = count_hh1_tf)) + geom_histogram(bins = 30)
count_hh2_hist <- ggplot(train_transformed, aes(x = count_hh2_tf)) + geom_histogram(bins = 30)
count_hh3_hist <- ggplot(train_transformed, aes(x = count_hh3_tf)) + geom_histogram(bins = 30)
count_hh4_hist <- ggplot(train_transformed, aes(x = count_hh4_tf)) + geom_histogram(bins = 30)
grid.arrange(count_hh1_hist, count_hh2_hist, count_hh3_hist, count_hh4_hist)

# Number of people sharing the account
count_how_many1_hist <- ggplot(train_transformed, aes(x = count_howmany1_tf)) + geom_histogram(bins = 30)
count_how_many2_hist <- ggplot(train_transformed, aes(x = count_howmany2_tf)) + geom_histogram(bins = 30)
count_how_many3_hist <- ggplot(train_transformed, aes(x = count_howmany3_tf)) + geom_histogram(bins = 30)
count_how_many4_hist <- ggplot(train_transformed, aes(x = count_howmany4_tf)) + geom_histogram(bins = 30)
grid.arrange(count_how_many1_hist, count_how_many2_hist, count_how_many3_hist, count_how_many4_hist)

# Age brackets
count_1824_hist <- ggplot(train_transformed, aes(x = count_1824_tf)) + geom_histogram(bins = 30)
count_2534_hist <- ggplot(train_transformed, aes(x = count_2534_tf)) + geom_histogram(bins = 30)
count_3544_hist <- ggplot(train_transformed, aes(x = count_3544_tf)) + geom_histogram(bins = 30)
count_4554_hist <- ggplot(train_transformed, aes(x = count_4554_tf)) + geom_histogram(bins = 30)
count_5564_hist <- ggplot(train_transformed, aes(x = count_5564_tf)) + geom_histogram(bins = 30)
count_65up_hist <- ggplot(train_transformed, aes(x = count_65up_tf)) + geom_histogram(bins = 30)
grid.arrange(count_1824_hist, count_2534_hist, count_3544_hist, count_4554_hist, count_5564_hist, count_65up_hist)

# Income brackets
count_und25k_hist <- ggplot(train_transformed, aes(x = count_und25k_tf)) + geom_histogram(bins = 30)
count_2549k_hist <- ggplot(train_transformed, aes(x = count_2549k_tf)) + geom_histogram(bins = 30)
count_5074k_hist <- ggplot(train_transformed, aes(x = count_5074k_tf)) + geom_histogram(bins = 30)
count_7599k_hist <- ggplot(train_transformed, aes(x = count_7599k_tf)) + geom_histogram(bins = 30)
# The transformed 100-149k column is named count_101149k_tf in train_transformed.
count_100149k_hist <- ggplot(train_transformed, aes(x = count_101149k_tf)) + geom_histogram(bins = 30)
count_150kup_hist <- ggplot(train_transformed, aes(x = count_150kup_tf)) + geom_histogram(bins = 30)
# Fixed: the second panel used to be count_2534_hist (the 25-34 age bracket),
# so the transformed 25-49k income histogram was never shown.
grid.arrange(count_und25k_hist, count_2549k_hist, count_5074k_hist, count_7599k_hist, count_100149k_hist, count_150kup_hist)

# Education
count_lessHS_hist <- ggplot(train_transformed, aes(x = count_lessHS_tf)) + geom_histogram(bins = 30)
count_HS_hist <- ggplot(train_transformed, aes(x = count_HS_tf)) + geom_histogram(bins = 30)
count_B_hist <- ggplot(train_transformed, aes(x = count_B_tf)) + geom_histogram(bins = 30)
count_G_hist <- ggplot(train_transformed, aes(x = count_G_tf)) + geom_histogram(bins = 30)
grid.arrange(count_lessHS_hist, count_HS_hist, count_B_hist, count_G_hist)
# Scatterplot matrices of log_total against the raw count predictors, split
# into four families so each matrix stays readable.

# Overall, gender and orders-per-month counts
pairs(
  log_total ~ count + count_female + count_male +
    count_less5 + count_5to10 + count_over10,
  data = train
)
# Household size and number of people sharing the account
pairs(
  log_total ~ count_hh1 + count_hh2 + count_hh3 + count_hh4 +
    count_howmany1 + count_howmany2 + count_howmany3 + count_howmany4,
  data = train
)
# Age brackets
pairs(
  log_total ~ count_1824 + count_2534 + count_3544 +
    count_4554 + count_5564 + count_65up,
  data = train
)
# Income brackets and education
pairs(
  log_total ~ count_und25k + count_2549k + count_5074k + count_7599k +
    count_100149k + count_150kup +
    count_lessHS + count_HS + count_B + count_G,
  data = train
)
# Scatterplot matrices of log_total against the transformed (`*_tf`) predictors,
# using the same four families as the raw-scale matrices.

# Overall, gender and orders-per-month counts
pairs(
  log_total ~ count_tf + count_female_tf + count_male_tf +
    count_less5_tf + count_5to10_tf + count_over10_tf,
  data = train_transformed
)
# Household size and number of people sharing the account
pairs(
  log_total ~ count_hh1_tf + count_hh2_tf + count_hh3_tf + count_hh4_tf +
    count_howmany1_tf + count_howmany2_tf + count_howmany3_tf + count_howmany4_tf,
  data = train_transformed
)
# Age brackets
pairs(
  log_total ~ count_1824_tf + count_2534_tf + count_3544_tf +
    count_4554_tf + count_5564_tf + count_65up_tf,
  data = train_transformed
)
# Income brackets and education.
# Fixed: this matrix used the raw count_lessHS column instead of
# count_lessHS_tf, mixing one untransformed variable into the transformed view.
# The transformed 100-149k column is named count_101149k_tf in train_transformed.
pairs(
  log_total ~ count_und25k_tf + count_2549k_tf + count_5074k_tf + count_7599k_tf +
    count_101149k_tf + count_150kup_tf +
    count_lessHS_tf + count_HS_tf + count_B_tf + count_G_tf,
  data = train_transformed
)
# One column chart of log_total by month for each year 2018-2022, stacked in a
# single grid. geom_col() stacks every row that shares a month, so each bar is
# the sum of log_total over that month's rows.
month_vs_log_total_2018 <- ggplot(dplyr::filter(train, year == 2018), aes(x = month, y = log_total)) +
  geom_col() +
  xlab("Month") +
  theme(axis.text = element_text(size = 7))
month_vs_log_total_2019 <- ggplot(dplyr::filter(train, year == 2019), aes(x = month, y = log_total)) +
  geom_col() +
  xlab("Month") +
  theme(axis.text = element_text(size = 7))
month_vs_log_total_2020 <- ggplot(dplyr::filter(train, year == 2020), aes(x = month, y = log_total)) +
  geom_col() +
  xlab("Month") +
  theme(axis.text = element_text(size = 7))
month_vs_log_total_2021 <- ggplot(dplyr::filter(train, year == 2021), aes(x = month, y = log_total)) +
  geom_col() +
  xlab("Month") +
  theme(axis.text = element_text(size = 7))
month_vs_log_total_2022 <- ggplot(dplyr::filter(train, year == 2022), aes(x = month, y = log_total)) +
  geom_col() +
  xlab("Month") +
  theme(axis.text = element_text(size = 7))
grid.arrange(
  month_vs_log_total_2018,
  month_vs_log_total_2019,
  month_vs_log_total_2020,
  month_vs_log_total_2021,
  month_vs_log_total_2022
)
# Monthly log_total with one dodged bar colour per year.
# `year` is turned into a factor so ggplot2 treats it as a discrete fill.
year <- factor(train$year)
# tibble() replaces the deprecated data_frame(). The columns get explicit names
# so aes() can refer to them directly instead of reaching back into `train`
# with `$`, which bypasses the data passed to ggplot().
df <- tibble(month = train$month, log_total = train$log_total, year = year)
ggplot(df, aes(x = month, y = log_total, fill = year)) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Month") +
  ylab("log_total") +
  ggtitle("Month v.s. Log_Total separated by years 2018-2022") +
  theme(plot.title = element_text(hjust = 0.5), axis.text = element_text(size = 7))
# Bar chart of how many training rows fall in each year.
ggplot(train, aes(x = year)) +
  geom_bar() +
  ggtitle("Number of Observations per Year") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 7)
  )
# Tabulate and show the number of training rows per year.
freq_year <- table(train$year)
print(freq_year)
##
## 2018 2019 2020 2021 2022
## 579 582 593 594 594
Results: log_total looks about the same from year to year, and there is no clear pattern across months.
# Boxplots of log_total per state, split into five panels of states taken in
# alphabetical order (ten per panel; the last panel takes positions 41-51).
states_ordered <- sort(unique(train$q_demos_state))
al_fl_boxplot <- ggplot(
  data = dplyr::filter(train, q_demos_state %in% states_ordered[1:10]),
  mapping = aes(x = q_demos_state, y = log_total)
) +
  geom_boxplot() +
  theme(axis.text = element_text(size = 3))
ga_me_boxplot <- ggplot(
  data = dplyr::filter(train, q_demos_state %in% states_ordered[11:20]),
  mapping = aes(x = q_demos_state, y = log_total)
) +
  geom_boxplot() +
  theme(axis.text = element_text(size = 3))
md_nh_boxplot <- ggplot(
  data = dplyr::filter(train, q_demos_state %in% states_ordered[21:30]),
  mapping = aes(x = q_demos_state, y = log_total)
) +
  geom_boxplot() +
  theme(axis.text = element_text(size = 3))
nj_ri_boxplot <- ggplot(
  data = dplyr::filter(train, q_demos_state %in% states_ordered[31:40]),
  mapping = aes(x = q_demos_state, y = log_total)
) +
  geom_boxplot() +
  theme(axis.text = element_text(size = 3))
sc_wy_boxplot <- ggplot(
  data = dplyr::filter(train, q_demos_state %in% states_ordered[41:51]),
  mapping = aes(x = q_demos_state, y = log_total)
) +
  geom_boxplot() +
  theme(axis.text = element_text(size = 3))
grid.arrange(
  al_fl_boxplot, ga_me_boxplot, md_nh_boxplot, nj_ri_boxplot, sc_wy_boxplot,
  top = textGrob(
    "Boxplot of Log_Total for Each State",
    gp = gpar(fontsize = 20, font = 3)
  )
)
Results: Most states have fairly high log_total values of about 3.5-4 (order totals of roughly 10^3.5 to 10^4), with the exception of Alaska, Hawaii, Idaho, Montana, North Dakota, Rhode Island, South Dakota, Vermont, and Wyoming, which sit at about 2.5-3 (roughly 10^2.5 to 10^3).
# Scatter of log_total against each household-size count, one colour per size.
# The colour aesthetic carries a series key rather than a colour name. ggplot2
# orders discrete keys alphabetically, so the previous unnamed `values` and
# `labels` were assigned to the wrong series (e.g. the size-3 points were drawn
# red and labelled "Household Size of 1"). Named `values` plus explicit `breaks`
# tie every colour and label to its own series.
ggplot(train, aes(y = log_total)) +
  geom_point(aes(x = count_hh1, color = "hh1"), alpha = 0.3) +
  geom_point(aes(x = count_hh2, color = "hh2"), alpha = 0.3) +
  geom_point(aes(x = count_hh3, color = "hh3"), alpha = 0.3) +
  geom_point(aes(x = count_hh4, color = "hh4"), alpha = 0.3) +
  labs(
    title = "Count of Household Size v.s. Log_Total Scatterplot",
    x = "Count of Household Size",
    y = "Log_Total"
  ) +
  scale_color_manual(
    values = c(hh1 = "red", hh2 = "orange", hh3 = "black", hh4 = "green"),
    breaks = c("hh1", "hh2", "hh3", "hh4"),
    labels = c("Household Size of 1", "Household Size of 2", "Household Size of 3", "Household Size of 4+")
  )
# LOESS trend of log_total against each household-size count.
# Fixes:
#  * The colour aesthetic now carries a series key, with named `values` and
#    explicit `breaks`. ggplot2 orders discrete keys alphabetically, so the
#    previous unnamed `values`/`labels` were attached to the wrong curves.
#  * The base mapping no longer sets x to count_less5, an unrelated variable
#    that every layer overrode.
#  * The x label said "Log_Count", but the x values are raw counts.
ggplot(train, aes(y = log_total)) +
  geom_smooth(aes(x = count_hh1, color = "hh1"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_hh2, color = "hh2"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_hh3, color = "hh3"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_hh4, color = "hh4"), method = "loess", se = FALSE) +
  labs(
    title = "Count of Household Size v.s. Log_Total Scatterplot",
    x = "Count of Household Size",
    y = "Log_Total"
  ) +
  scale_color_manual(
    values = c(hh1 = "red", hh2 = "orange", hh3 = "black", hh4 = "green"),
    breaks = c("hh1", "hh2", "hh3", "hh4"),
    labels = c("Household Size of 1", "Household Size of 2", "Household Size of 3", "Household Size of 4+")
  )
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Scatter of log_total against the three orders-per-month bucket counts.
# The colour aesthetic carries a series key; named `values` and explicit
# `breaks` bind each colour and legend label to its own series. With the
# previous unnamed vectors ggplot2's alphabetical key order swapped the
# "less than 5" and "over 10" colours and labels.
ggplot(train, aes(y = log_total)) +
  geom_point(aes(x = count_less5, color = "less5"), alpha = 0.3) +
  geom_point(aes(x = count_5to10, color = "5to10"), alpha = 0.3) +
  geom_point(aes(x = count_over10, color = "over10"), alpha = 0.3) +
  labs(
    title = "Count of Orders Purchased Per Month v.s. Log_Total Scatterplot",
    x = "Count of Orders Purchased Per Month",
    y = "Log_Total"
  ) +
  scale_color_manual(
    values = c(less5 = "red", "5to10" = "orange", over10 = "black"),
    breaks = c("less5", "5to10", "over10"),
    labels = c("Less than 5 Orders Purchased", "Count of 5 - 10 Orders Purchased", "Count of Over 10 Orders Per Month")
  )
# LOESS trend of log_total against the three orders-per-month bucket counts.
# The colour aesthetic carries a series key; named `values` and explicit
# `breaks` bind each colour and legend label to its own curve. With the
# previous unnamed vectors ggplot2's alphabetical key order swapped the
# "less than 5" and "over 10" colours and labels.
ggplot(train, aes(y = log_total)) +
  geom_smooth(aes(x = count_less5, color = "less5"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_5to10, color = "5to10"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_over10, color = "over10"), method = "loess", se = FALSE) +
  labs(
    title = "Count of Orders Purchased Per Month v.s. Log_Total Scatterplot",
    x = "Count of Orders Purchased Per Month",
    y = "Log_Total"
  ) +
  scale_color_manual(
    values = c(less5 = "red", "5to10" = "orange", over10 = "black"),
    breaks = c("less5", "5to10", "over10"),
    labels = c("Less than 5 Orders Purchased", "Count of 5 - 10 Orders Purchased", "Count of Over 10 Orders Per Month")
  )
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Scatter of log_total against the female and male customer counts.
# The previous legend was only correct because the key "blue" happens to sort
# before "red". Series keys with named `values` and explicit `breaks` make the
# colour/label assignment independent of alphabetical order.
ggplot(train, aes(y = log_total)) +
  geom_point(aes(x = count_female, color = "female")) +
  geom_point(aes(x = count_male, color = "male")) +
  labs(
    title = "Gender v.s. Log_Total Scatterplot",
    x = "Count of Each Gender",
    y = "Log_Total"
  ) +
  scale_color_manual(
    values = c(female = "blue", male = "red"),
    breaks = c("female", "male"),
    labels = c("Count of Female", "Male")
  )
# Scatter of log_total against the customer count in each age bracket.
# The colour aesthetic carries a series key; named `values` and explicit
# `breaks` bind each colour and legend label to its own bracket. With the
# previous unnamed vectors, ggplot2's alphabetical ordering of the keys
# ("black", "blue", "green", ...) attached colours and labels to the wrong
# brackets.
ggplot(train, aes(y = log_total)) +
  geom_point(aes(x = count_1824, color = "a1824"), alpha = 0.3) +
  geom_point(aes(x = count_2534, color = "a2534"), alpha = 0.3) +
  geom_point(aes(x = count_3544, color = "a3544"), alpha = 0.3) +
  geom_point(aes(x = count_4554, color = "a4554"), alpha = 0.3) +
  geom_point(aes(x = count_5564, color = "a5564"), alpha = 0.3) +
  geom_point(aes(x = count_65up, color = "a65up"), alpha = 0.3) +
  labs(
    title = "Count of How Many People Are in Each Age Range vs. Log_Total Scatterplot",
    x = "Number of People in Each Age Group",
    y = "Log_Total"
  ) +
  scale_color_manual(
    values = c(
      a1824 = "red", a2534 = "orange", a3544 = "black",
      a4554 = "green", a5564 = "blue", a65up = "purple"
    ),
    breaks = c("a1824", "a2534", "a3544", "a4554", "a5564", "a65up"),
    labels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")
  )
# LOESS trend of log_total against the customer count in each age bracket.
# The colour aesthetic carries a series key; named `values` and explicit
# `breaks` bind each colour and legend label to its own curve. With the
# previous unnamed vectors, ggplot2's alphabetical ordering of the keys
# attached colours and labels to the wrong brackets.
ggplot(train, aes(y = log_total)) +
  geom_smooth(aes(x = count_1824, color = "a1824"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_2534, color = "a2534"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_3544, color = "a3544"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_4554, color = "a4554"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_5564, color = "a5564"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_65up, color = "a65up"), method = "loess", se = FALSE) +
  labs(
    title = "Count of How Many People Are in Each Age Range vs. Log_Total Scatterplot",
    x = "Number of People in Each Age Group",
    y = "Log_Total"
  ) +
  scale_color_manual(
    values = c(
      a1824 = "red", a2534 = "orange", a3544 = "black",
      a4554 = "green", a5564 = "blue", a65up = "purple"
    ),
    breaks = c("a1824", "a2534", "a3544", "a4554", "a5564", "a65up"),
    labels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")
  )
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Scatter of log_total against the customer count in each income bracket.
# The colour aesthetic carries a series key; named `values` and explicit
# `breaks` bind each colour and legend label to its own bracket. With the
# previous unnamed vectors, ggplot2's alphabetical ordering of the keys
# attached colours and labels to the wrong brackets.
ggplot(train, aes(y = log_total)) +
  geom_point(aes(x = count_und25k, color = "und25k"), alpha = 0.3) +
  geom_point(aes(x = count_2549k, color = "i2549k"), alpha = 0.3) +
  geom_point(aes(x = count_5074k, color = "i5074k"), alpha = 0.3) +
  geom_point(aes(x = count_7599k, color = "i7599k"), alpha = 0.3) +
  geom_point(aes(x = count_100149k, color = "i100149k"), alpha = 0.3) +
  geom_point(aes(x = count_150kup, color = "i150kup"), alpha = 0.3) +
  labs(
    title = "Count of Number of People in Each Income Bracket vs. Log_Total Scatterplot",
    x = "Number of People in Each Income Bracket",
    y = "Log_Total"
  ) +
  scale_color_manual(
    values = c(
      und25k = "red", i2549k = "orange", i5074k = "black",
      i7599k = "green", i100149k = "blue", i150kup = "purple"
    ),
    breaks = c("und25k", "i2549k", "i5074k", "i7599k", "i100149k", "i150kup"),
    labels = c("Under 25k", "25-49k", "50-74k", "75-99k", "100-149k", "150k+")
  )
# LOESS trend of log_total against the customer count in each income bracket.
# Fixes:
#  * The colour aesthetic now carries a series key, with named `values` and
#    explicit `breaks`. ggplot2 orders discrete keys alphabetically, so the
#    previous unnamed `values`/`labels` were attached to the wrong curves.
#  * The base mapping no longer sets x to count_howmany1, an unrelated variable
#    that every layer overrode.
ggplot(train, aes(y = log_total)) +
  geom_smooth(aes(x = count_und25k, color = "und25k"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_2549k, color = "i2549k"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_5074k, color = "i5074k"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_7599k, color = "i7599k"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_100149k, color = "i100149k"), method = "loess", se = FALSE) +
  geom_smooth(aes(x = count_150kup, color = "i150kup"), method = "loess", se = FALSE) +
  labs(
    title = "Count of Number of People in Each Income Bracket vs. Log_Total Scatterplot",
    x = "Number of People in Each Income Bracket",
    y = "Log_Total"
  ) +
  scale_color_manual(
    values = c(
      und25k = "red", i2549k = "orange", i5074k = "black",
      i7599k = "green", i100149k = "blue", i150kup = "purple"
    ),
    breaks = c("und25k", "i2549k", "i5074k", "i7599k", "i100149k", "i150kup"),
    labels = c("Under 25k", "25-49k", "50-74k", "75-99k", "100-149k", "150k+")
  )
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Scatter of log_total against the customer count at each education level.
# The colour aesthetic carries a series key; named `values` and explicit
# `breaks` bind each colour and legend label to its own level. With the
# previous unnamed vectors, ggplot2's alphabetical ordering of the keys
# attached colours and labels to the wrong levels.
ggplot(train, aes(y = log_total)) +
  geom_point(aes(x = count_lessHS, color = "lessHS"), alpha = 0.3) +
  geom_point(aes(x = count_HS, color = "HS"), alpha = 0.3) +
  geom_point(aes(x = count_B, color = "B"), alpha = 0.3) +
  geom_point(aes(x = count_G, color = "G"), alpha = 0.3) +
  labs(
    title = "Count of Customers with Each Type of Education vs. Log_Total Scatterplot",
    x = "Number of Customers with Each Type of Education",
    y = "Log_Total"
  ) +
  scale_color_manual(
    values = c(lessHS = "red", HS = "orange", B = "black", G = "green"),
    breaks = c("lessHS", "HS", "B", "G"),
    labels = c("Less Than High School Diploma", "High School Diploma", "Bachelor's Degree", "Graduate/Professional Degree")
  )
# GAM trend of log_total against the customer count at each education level.
# The colour aesthetic carries a series key; named `values` and explicit
# `breaks` bind each colour and legend label to its own curve. With the
# previous unnamed vectors, ggplot2's alphabetical ordering of the keys
# attached colours and labels to the wrong levels.
ggplot(train, aes(y = log_total)) +
  geom_smooth(aes(x = count_lessHS, color = "lessHS"), method = "gam", se = FALSE) +
  geom_smooth(aes(x = count_HS, color = "HS"), method = "gam", se = FALSE) +
  geom_smooth(aes(x = count_B, color = "B"), method = "gam", se = FALSE) +
  geom_smooth(aes(x = count_G, color = "G"), method = "gam", se = FALSE) +
  labs(
    title = "Count of Customers with Each Type of Education vs. Log_Total Scatterplot",
    x = "Number of Customers with Each Type of Education",
    y = "Log_Total"
  ) +
  scale_color_manual(
    values = c(lessHS = "red", HS = "orange", B = "black", G = "green"),
    breaks = c("lessHS", "HS", "B", "G"),
    labels = c("Less Than High School Diploma", "High School Diploma", "Bachelor's Degree", "Graduate/Professional Degree")
  )
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Overall, customers with less than a high school diploma and customers with a
Bachelor's degree seem to be the ones who tend to order more goods (i.e. ages
under 18 and over 21). Also:

* People earning under 50k account for a large share of the overall total.
* People aged 18-24 and 45-54 seem to order a lot of goods as well.
* 3-4 people is the most common number of people who share an account.
* Most states have fairly high log_totals of about 3.5-4 (order totals of
  roughly 10^3.5 to 10^4, i.e. about 3,162 to 10,000), with the exception of
  Alaska, Hawaii, Idaho, Montana, North Dakota, Rhode Island, South Dakota,
  Vermont, and Wyoming, which have a range of about 2.5-3 (10^2.5 to 10^3,
  i.e. about 316 to 1,000).
So, we should further investigate these variables: customer education, customer income, age range, people who share an account, and states.